This document will allow you to reproduce all supplementary figures.
Set working directory and load all necessary libraries and functions.
# If not installed, install and load the package "here", else: only load the package.
err <- try(library("here", character.only = TRUE), silent = TRUE)
## here() starts at /Users/lgoeminn/Documents/phD/MSqRobHurdlePaper
if (class(err) == 'try-error') {
install.packages("here", repos = "https://cloud.r-project.org")
library("here", character.only = TRUE)
}
wd <- here()
# Optional: change the working directory
setwd(wd)
# Load all functions and libraries
source(paste0(wd, "/R/functions.r"))
## Loading required namespace: BiocManager
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## ── Attaching packages ─────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.1 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.1
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
## Loading required package: mzR
## Loading required package: Rcpp
## Warning in fun(libname, pkgname): mzR has been built against a different Rcpp version (1.0.0)
## than is installed on your system (1.0.1). This might lead to errors
## when loading mzR. If you encounter such issues, please send a report,
## including the output of sessionInfo() to the Bioc support forum at
## https://support.bioconductor.org/. For details see also
## https://github.com/sneumann/mzR/wiki/mzR-Rcpp-compiler-linker-issue.
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
##
## first, rename
## The following object is masked from 'package:tidyr':
##
## expand
## The following object is masked from 'package:base':
##
## expand.grid
## Loading required package: ProtGenerics
##
## This is MSnbase version 2.8.3
## Visit https://lgatto.github.io/MSnbase/ to get started.
##
## Attaching package: 'MSnbase'
## The following object is masked from 'package:stats':
##
## smooth
## The following object is masked from 'package:base':
##
## trimws
##
## Attaching package: 'limma'
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:S4Vectors':
##
## expand
## The following object is masked from 'package:tidyr':
##
## expand
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: future
##
## Attaching package: 'future'
## The following object is masked from 'package:S4Vectors':
##
## values
## Loading required package: SummarizedExperiment
## Loading required package: GenomicRanges
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomeInfoDb
## Loading required package: DelayedArray
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
## The following objects are masked from 'package:Biobase':
##
## anyMissing, rowMedians
## The following object is masked from 'package:dplyr':
##
## count
## Loading required package: BiocParallel
##
## Attaching package: 'DelayedArray'
## The following objects are masked from 'package:matrixStats':
##
## colMaxs, colMins, colRanges, rowMaxs, rowMins, rowRanges
## The following object is masked from 'package:purrr':
##
## simplify
## The following objects are masked from 'package:base':
##
## aperm, apply
##
## Attaching package: 'stageR'
## The following object is masked from 'package:methods':
##
## getMethod
source(paste0(wd,"/R/plot_functions.R"))
Give session info for reproducibility.
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Mojave 10.14.5
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] MSqRob_0.7.6 stageR_1.3.29
## [3] SummarizedExperiment_1.10.1 DelayedArray_0.6.0
## [5] BiocParallel_1.14.2 matrixStats_0.54.0
## [7] GenomicRanges_1.32.3 GenomeInfoDb_1.16.0
## [9] IRanges_2.14.12 furrr_0.1.0.9002
## [11] future_1.13.0 readxl_1.3.1
## [13] colorspace_1.4-1 zoo_1.8-6
## [15] lme4_1.1-21 Matrix_1.2-17
## [17] limma_3.36.5 MSnbase_2.8.3
## [19] ProtGenerics_1.12.0 S4Vectors_0.18.3
## [21] mzR_2.16.1 Rcpp_1.0.1
## [23] Biobase_2.40.0 BiocGenerics_0.26.0
## [25] forcats_0.4.0 stringr_1.4.0
## [27] dplyr_0.8.1 purrr_0.3.2
## [29] readr_1.3.1 tidyr_0.8.3
## [31] tibble_2.1.3 ggplot2_3.1.1
## [33] tidyverse_1.2.1 MSstats_3.12.2
## [35] usethis_1.5.0 devtools_2.0.2
## [37] remotes_2.0.4 here_0.1
##
## loaded via a namespace (and not attached):
## [1] minqa_1.2.4 rprojroot_1.3-2 XVector_0.20.0
## [4] fs_1.3.1 rstudioapi_0.10 listenv_0.7.0
## [7] affyio_1.50.0 ggrepel_0.8.1 lubridate_1.7.4
## [10] xml2_1.2.0 codetools_0.2-16 splines_3.6.0
## [13] doParallel_1.0.14 ncdf4_1.16.1 impute_1.54.0
## [16] knitr_1.23 pkgload_1.0.2 jsonlite_1.6
## [19] nloptr_1.2.1 broom_0.5.2 vsn_3.48.1
## [22] BiocManager_1.30.4 compiler_3.6.0 httr_1.4.0
## [25] backports_1.1.4 assertthat_0.2.1 lazyeval_0.2.2
## [28] cli_1.1.0 htmltools_0.3.6 prettyunits_1.0.2
## [31] tools_3.6.0 GenomeInfoDbData_1.1.0 gtable_0.3.0
## [34] glue_1.3.1 affy_1.58.0 reshape2_1.4.3
## [37] MALDIquant_1.19.3 cellranger_1.1.0 gdata_2.18.0
## [40] preprocessCore_1.42.0 nlme_3.1-140 iterators_1.0.10
## [43] xfun_0.7 globals_0.12.4 ps_1.3.0
## [46] testthat_2.1.1 rvest_0.3.4 gtools_3.8.1
## [49] XML_3.98-1.20 zlibbioc_1.26.0 MASS_7.3-51.4
## [52] scales_1.0.0 BiocInstaller_1.30.0 pcaMethods_1.72.0
## [55] hms_0.4.2 doSNOW_1.0.16 yaml_2.2.0
## [58] memoise_1.1.0 stringi_1.4.3 desc_1.2.0
## [61] foreach_1.4.4 randomForest_4.6-14 caTools_1.17.1.2
## [64] boot_1.3-22 pkgbuild_1.0.3 rlang_0.3.4
## [67] pkgconfig_2.0.2 bitops_1.0-6 mzID_1.18.0
## [70] evaluate_0.14 lattice_0.20-38 processx_3.3.1
## [73] tidyselect_0.2.5 plyr_1.8.4 magrittr_1.5
## [76] R6_2.4.0 snow_0.4-3 gplots_3.0.1.1
## [79] generics_0.0.2 pillar_1.4.1 haven_2.1.0
## [82] withr_2.1.2 RCurl_1.95-4.12 survival_2.44-1.1
## [85] modelr_0.1.4 crayon_1.3.4 KernSmooth_2.23-15
## [88] rmarkdown_1.13 grid_3.6.0 minpack.lm_1.2-1
## [91] data.table_1.12.2 marray_1.58.0 callr_3.2.0
## [94] digest_0.6.19 munsell_0.5.0 sessioninfo_1.1.1
Important: make sure you have run the CPTAC and HEART analyses, or load the necessary files as follows:
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC2.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_KNN1.RData"))
load(paste0(wd, "/save_files_CPTAC/peptides_CPTAC_Perseus_imp.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_co_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_Hurdle.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_PI_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_KNN1_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_MSstats_full.RData"))
load(paste0(wd, "/save_files_CPTAC/CPTAC_Perseus_full.RData"))
load(paste0(wd, "/save_files_CPTAC/CPTAC_Perseus_imp_full.RData"))
load(paste0(wd, "/save_files_CPTAC/res_CPTAC_ProPCA_full.RData"))
# testResultOneComparison is too big and cannot be put on our GitHub page, but can be obtained by running 7_analysis_CPTAC_MSstats.Rmd
load(paste0(wd, "/save_files_CPTAC/testResultOneComparison.RData"))
load(paste0(wd, "/save_files_PXD006675/peptides_HEART2.RData"))
load(paste0(wd, "/save_files_PXD006675/res_HEART_Hurdle.RData"))
load(paste0(wd, "/save_files_PXD006675/overlap_PHM.RData"))
We here show the sample-wise effect of kNN and Perseus imputation at the level of MaxQuant’s LFQ-values (protein level). Blue are the original values, red are the imputed values.
# Import non-imputed data
file.proteinGroups <- paste0(wd,"/datasets/CPTAC/proteinGroups_preprocessed.txt")
exprs_col <- grepEcols(file.proteinGroups, "LFQ.intensity.", split = "\t")
proteinGroups.CPTAC <- readMSnSet2(file.proteinGroups, ecol = exprs_col, fnames = c("Protein.IDs"), sep = "\t")
sampleNames(proteinGroups.CPTAC) <- str_replace(sampleNames(proteinGroups.CPTAC), "LFQ.intensity.", "") %>% make.names
# Import Perseus-imputed data
file.proteinGroups <- paste0(wd,"/datasets/CPTAC/proteinGroups_preprocessed_imputed.txt")
exprs_col <- grepEcols(file.proteinGroups, "LFQ.intensity.", split = "\t")
proteinGroups.CPTAC.Perseus.imp <- readMSnSet2(file.proteinGroups, ecol = exprs_col, fnames = c("Protein.IDs"), sep = "\t")
sampleNames(proteinGroups.CPTAC.Perseus.imp) <- str_replace(sampleNames(proteinGroups.CPTAC.Perseus.imp), "LFQ.intensity.", "") %>% make.names
sample.names <- paste0(sampleNames(proteinGroups.CPTAC) %>% substr(3, 3), sampleNames(proteinGroups.CPTAC) %>% substr(5, 5))
### Impute with KNN-1
proteinGroups.CPTAC.KNN1 <- impute(proteinGroups.CPTAC, method = "knn")
## Warning in knnimp(x, k, maxmiss = rowmax, maxp = maxp): 700 rows with more than 50 % entries missing;
## mean imputation used for these rows
cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))
# 1. Histograms for the effect of Perseus imputation LFQ-level (Supplementary Fig. 1)
# grDevices::svg(paste0(wd,"/Perseus_protein_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))
for(i in 1:ncol(exprs(proteinGroups.CPTAC.Perseus.imp))){
p1 <- hist(exprs(proteinGroups.CPTAC.Perseus.imp[,i]), breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC[,i]), breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(12,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation sample ", sample.names[i]), xlab = "log2(LFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(12,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# dev.off()
# 2. Histograms for the effect of kNN imputation at LFQ-level (Supplementary Fig. 2)
# grDevices::svg(paste0(wd,"/kNN_protein_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))
for(i in 1:ncol(exprs(proteinGroups.CPTAC.KNN1))){
p1 <- hist(exprs(proteinGroups.CPTAC.KNN1)[,i], breaks=25, plot = FALSE)
p2 <- hist(exprs(proteinGroups.CPTAC)[,i], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation sample ", sample.names[i]), xlab = "log2(LFQ intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# dev.off()
We here show the sample-wise effect of kNN and Perseus imputation at the peptide level. Blue are the original values, red are the imputed values.
# 3. Histograms for the effect of Perseus imputation at peptide-level (Supplementary Fig. 3)
# grDevices::svg(paste0(wd,"/Perseus_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))
for(i in 1:ncol(exprs(peptides.CPTAC.Perseus.imp))){
p1 <- hist(exprs(peptides.CPTAC.Perseus.imp[,i]), breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2[,i]), breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(12,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("Perseus imputation sample ", sample.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(12,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# dev.off()
# 4. Histograms for the effect of kNN imputation at peptide-level (Supplementary Fig. 4)
sample.names <- paste0(sampleNames(peptides.CPTAC.KNN1) %>% substr(3, 3), sampleNames(peptides.CPTAC.KNN1) %>% substr(5, 5))
cex.main <- 1.2
cex.lab <- 1
cex <- 1
par(mfrow=c(1,1))
# grDevices::svg(paste0(wd,"/kNN_sample_imp.svg"), width=60, height=90)
# cex.main <- 10
# cex.lab <- 8
# cex <- 6
# par(mfrow=c(1,3))
# par(mar=c(5.1*cex/2,4.1*cex/2,4.1*cex/2,2.1*cex/2))
# par(mfrow=c(9,3))
for(i in 1:ncol(exprs(peptides.CPTAC.KNN1))){
p1 <- hist(exprs(peptides.CPTAC.KNN1)[,i], breaks=25, plot = FALSE)
p2 <- hist(exprs(peptides.CPTAC2)[,i], breaks=25, plot = FALSE)
plot(p1, col=rgb(255,0,0, maxColorValue = 255), xlim=c(13,28), border=rgb(255,165,0, maxColorValue = 255), las = 1, main = paste0("kNN imputation sample ", sample.names[i]), xlab = "log2(peptide intensity)", cex.lab=cex.lab, cex.axis=cex, cex.main=cex.main, cex.sub=cex, lwd=cex) # first histogram
plot(p2, col=rgb(0,0,255, maxColorValue = 255), xlim=c(13,28), border=rgb(0,165,255, maxColorValue = 255), add=TRUE) # second histogram
}
# dev.off()
# FDR-FTP plot including MSstats, MSstats with Inf and kNN, based on our preprocessing
res.list <- list(res.CPTAC.full,
res.CPTAC.co.full,
res.CPTAC.Hurdle,
res.CPTAC.KNN1.full,
res.CPTAC.PI.full,
CPTAC.Perseus.full,
CPTAC.Perseus.imp.full,
res.CPTAC.MSstats.full,
res.CPTAC.MSstats.full[!is.infinite(res.CPTAC.MSstats.full$log2FC),],
res.CPTAC.ProPCA.full
)
contrast.vec <- c("condition6B-condition6A",
"condition6C-condition6B",
"condition6C-condition6A")
names(contrast.vec) <- c("Condition 6B vs. 6A",
"Condition 6C vs. 6B",
"Condition 6C vs. 6A")
colors <- c("#FF681E", "#E15E9E", "#5A2A82", "blue", "green", "#1B2944", "#42B7BA", "yellow3", "peru", "red2") # darkblue: "#1B2944" gray: "#50FF00"
sort.list <- list(c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logOR"),
c("qchisq", "pchisq", "chisq", NA),
c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logFC"),
c("q.value", "p.value", "Test.statistic", "Difference"),
c("q.value", "p.value", "Test.statistic", "Difference"),
c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
c("adj.P.Val", "P.Value", "t", "logFC"))
PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) #TRUE
# Since in our preprocessing, 2 significant proteins were removed from the MSstats output, we also redo the whole thing with MSstats preprocessing as a reference.
res.CPTAC.MSstats <- as_tibble(testResultOneComparison$ComparisonResult)
res.CPTAC.MSstats <- res.CPTAC.MSstats %>% dplyr::rename(protein = Protein, contrast = Label)
res.CPTAC.MSstats$contrast <- res.CPTAC.MSstats$contrast %>% as.character
res.CPTAC.MSstats$contrast[res.CPTAC.MSstats$contrast == "6B-6A"] <- "condition6B-condition6A"
res.CPTAC.MSstats$contrast[res.CPTAC.MSstats$contrast == "6C-6A"] <- "condition6C-condition6A"
res.CPTAC.MSstats$contrast[res.CPTAC.MSstats$contrast == "6C-6B"] <- "condition6C-condition6B"
res.CPTAC.Base2 <- res.CPTAC.MSstats %>% select(protein, contrast)
res.CPTAC.Base2$UPS <- grepl("ups", res.CPTAC.Base2$protein)
res.CPTAC.MSstats.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.MSstats)
## Joining, by = c("protein", "contrast")
res.CPTAC.MSstats.full2 <- res.CPTAC.MSstats.full2 %>% arrange(adj.pvalue,pvalue)
res.CPTAC.MSstats.noInf.full2 <- res.CPTAC.MSstats.full2
res.CPTAC.MSstats.noInf.full2[is.infinite(res.CPTAC.MSstats.noInf.full2$log2FC),][,c("log2FC", "SE", "Tvalue", "DF", "pvalue", "adj.pvalue")] <- NA
res.CPTAC.MSstats.noInf.full2 <- res.CPTAC.MSstats.noInf.full2 %>% arrange(adj.pvalue,pvalue)
# Perseus without imputation #
# 1. Import Perseus results
results.Perseus <- read.table(paste0(wd,"/datasets/CPTAC/t-tests_Perseus_1_6_0_7-6A-6B-6C.txt"), sep="\t", header=TRUE, quote = "", comment.char = "")
results.Perseus <- results.Perseus[-c(1,2),] %>% map_dfr(~{.x %>% as.character %>% type.convert})
# 2. Convert wide to long
results.Perseus <- results.Perseus[,43:58]
colnames(results.Perseus) <- gsub("Student.s.T.test.","",colnames(results.Perseus))
res.Perseus <- results.Perseus %>% reshape(timevar = "contrast", varying =,1:12, direction = "long", sep = ".6")
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
res.Perseus$contrast[res.Perseus$contrast == "B_6A"] <- "condition6B-condition6A"
res.Perseus$contrast[res.Perseus$contrast == "C_6A"] <- "condition6C-condition6A"
res.Perseus$contrast[res.Perseus$contrast == "C_6B"] <- "condition6C-condition6B"
res.Perseus <- res.Perseus %>% dplyr::rename(protein = Majority.protein.IDs, protein.name = Protein.names, gene.name = Gene.names) %>% select(-Protein.IDs)
UPS.Perseus <- unique(res.Perseus$protein)[grepl("ups", unique(res.Perseus$protein))]
UPS.MSstats <- unique(res.CPTAC.MSstats$protein)[grepl("ups", unique(res.CPTAC.MSstats$protein))]
UPS.Hurdle <- unique(res.CPTAC.Hurdle$protein)[grepl("ups", unique(res.CPTAC.Hurdle$protein))]
UPS.MSstats[!(UPS.MSstats %in% UPS.Perseus)]
## [1] P01008ups;CON__P41361 P68871ups;CON__P02070;CON__Q3SX09
## [3] P99999ups;CON__P62894
## 1446 Levels: A5Z2X5 D6VTK4 O00762ups O13516 O13547 O13563 O14455 ... Q99385
# P01008ups;CON__P41361 P68871ups;CON__P02070;CON__Q3SX09 P99999ups;CON__P62894
UPS.Perseus[!(UPS.Perseus %in% UPS.MSstats)]
## [1] P01008ups P68871ups P99999ups
## 1462 Levels: A5Z2X5 D6VTK4 O00762ups O13516 O13547 O13563 O14455 ... Q99385
# P01008ups P68871ups P99999ups
# -> all three of them link perfectly with each other, thus we will fill them in
CPTAC.Perseus.full2 <- res.CPTAC.Base2 %>% left_join(res.Perseus)
## Joining, by = c("protein", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
CPTAC.Perseus.full2[CPTAC.Perseus.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus[res.Perseus$protein == "P01008ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]
CPTAC.Perseus.full2[CPTAC.Perseus.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus[res.Perseus$protein == "P68871ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]
CPTAC.Perseus.full2[CPTAC.Perseus.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus[res.Perseus$protein == "P99999ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]
CPTAC.Perseus.full2 <- CPTAC.Perseus.full2 %>% arrange(p.value)
# Perseus with imputation #
results.Perseus.imp <- read.table(paste0(wd,"/datasets/CPTAC/t-tests_Perseus_imputed_1_6_0_7-6A-6B-6C.txt"), sep="\t", header=TRUE, quote = "", comment.char = "")
results.Perseus.imp <- results.Perseus.imp[-c(1,2),] %>% map_dfr(~{.x %>% as.character %>% type.convert})
# 2. Convert wide to long
results.Perseus.imp <- results.Perseus.imp[,43:58]
colnames(results.Perseus.imp) <- gsub("Student.s.T.test.","",colnames(results.Perseus.imp))
res.Perseus.imp <- results.Perseus.imp %>% reshape(timevar = "contrast", varying =,1:12, direction = "long", sep = ".6")
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
## Warning: Setting row names on a tibble is deprecated.
res.Perseus.imp$contrast[res.Perseus.imp$contrast == "B_6A"] <- "condition6B-condition6A"
res.Perseus.imp$contrast[res.Perseus.imp$contrast == "C_6A"] <- "condition6C-condition6A"
res.Perseus.imp$contrast[res.Perseus.imp$contrast == "C_6B"] <- "condition6C-condition6B"
res.Perseus.imp <- res.Perseus.imp %>% dplyr::rename(protein = Majority.protein.IDs, protein.name = Protein.names, gene.name = Gene.names) %>% select(-Protein.IDs)
CPTAC.Perseus.imp.full2 <- res.CPTAC.Base2 %>% left_join(res.Perseus.imp)
## Joining, by = c("protein", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
CPTAC.Perseus.imp.full2[CPTAC.Perseus.imp.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus.imp[res.Perseus.imp$protein == "P01008ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]
CPTAC.Perseus.imp.full2[CPTAC.Perseus.imp.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus.imp[res.Perseus.imp$protein == "P68871ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]
CPTAC.Perseus.imp.full2[CPTAC.Perseus.imp.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")] <- res.Perseus.imp[res.Perseus.imp$protein == "P99999ups",][,c("contrast", "p.value", "q.value", "Difference", "Test.statistic")]
CPTAC.Perseus.imp.full2 <- CPTAC.Perseus.imp.full2 %>% arrange(p.value)
# Hurdle model #
UPS.MSstats[!(UPS.MSstats %in% UPS.Hurdle)]
## [1] P01008ups;CON__P41361 P01112ups
## [3] P09211ups P62988ups
## [5] P68871ups;CON__P02070;CON__Q3SX09 P99999ups;CON__P62894
## 1446 Levels: A5Z2X5 D6VTK4 O00762ups O13516 O13547 O13563 O14455 ... Q99385
# P01008ups;CON__P41361 P01112ups P09211ups P62988ups P68871ups;CON__P02070;CON__Q3SX09 P99999ups;CON__P62894
UPS.Hurdle[!(UPS.Hurdle %in% UPS.MSstats)]
## [1] P99999ups P01008ups P68871ups
## [4] P05759;P0CG63;P62988ups
## 1655 Levels: A5Z2X5 ... Q99385
# P99999ups P01008ups P68871ups P05759;P0CG63;P62988ups
# -> everything except P01112ups and P09211ups link up
# => insert the estimates of the 4 others
res.CPTAC.Hurdle2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.Hurdle)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
res.CPTAC.Hurdle2[res.CPTAC.Hurdle2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")] <- res.CPTAC.Hurdle[res.CPTAC.Hurdle$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")]
res.CPTAC.Hurdle2[res.CPTAC.Hurdle2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")] <- res.CPTAC.Hurdle[res.CPTAC.Hurdle$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")]
res.CPTAC.Hurdle2[res.CPTAC.Hurdle2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")] <- res.CPTAC.Hurdle[res.CPTAC.Hurdle$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")]
res.CPTAC.Hurdle2[res.CPTAC.Hurdle2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")] <- res.CPTAC.Hurdle[res.CPTAC.Hurdle$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "logOR", "chisq", "pchisq", "qchisq")]
res.CPTAC.Hurdle2 <- res.CPTAC.Hurdle2 %>% arrange(pchisq)
# MSqRob without preprocessing #
res.CPTAC.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
res.CPTAC.full2[res.CPTAC.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.full[res.CPTAC.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.full2[res.CPTAC.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.full[res.CPTAC.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.full2[res.CPTAC.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.full[res.CPTAC.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.full2[res.CPTAC.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.full[res.CPTAC.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.full2 <- res.CPTAC.full2 %>% arrange(pvalue)
# Quasibinomial model #
res.CPTAC.co.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.co.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
res.CPTAC.co.full2[res.CPTAC.co.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.co.full[res.CPTAC.co.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.co.full2[res.CPTAC.co.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.co.full[res.CPTAC.co.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.co.full2[res.CPTAC.co.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.co.full[res.CPTAC.co.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.co.full2[res.CPTAC.co.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.co.full[res.CPTAC.co.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logOR", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.co.full2 <- res.CPTAC.co.full2 %>% arrange(pvalue)
# MSqRob with kNN imputation #
res.CPTAC.KNN1.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.KNN1.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
res.CPTAC.KNN1.full2[res.CPTAC.KNN1.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.KNN1.full[res.CPTAC.KNN1.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.KNN1.full2[res.CPTAC.KNN1.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.KNN1.full[res.CPTAC.KNN1.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.KNN1.full2[res.CPTAC.KNN1.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.KNN1.full[res.CPTAC.KNN1.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.KNN1.full2[res.CPTAC.KNN1.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.KNN1.full[res.CPTAC.KNN1.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.KNN1.full2 <- res.CPTAC.KNN1.full2 %>% arrange(pvalue)
# MSqRob with Perseus imputation #
res.CPTAC.PI.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.PI.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
res.CPTAC.PI.full2[res.CPTAC.PI.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.PI.full[res.CPTAC.PI.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.PI.full2[res.CPTAC.PI.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.PI.full[res.CPTAC.PI.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.PI.full2[res.CPTAC.PI.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.PI.full[res.CPTAC.PI.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.PI.full2[res.CPTAC.PI.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")] <- res.CPTAC.PI.full[res.CPTAC.PI.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "se", "t", "df", "pvalue", "qvalue")]
res.CPTAC.PI.full2 <- res.CPTAC.PI.full2 %>% arrange(pvalue)
# ProPCA with limma #
res.CPTAC.ProPCA.full
## # A tibble: 4,143 x 11
## protein UPS gene.name protein.name contrast logFC AveExpr t
## <chr> <lgl> <fct> <fct> <chr> <dbl> <dbl> <dbl>
## 1 Q06830… TRUE "" "" conditi… 0.552 1.91 32.9
## 2 Q06830… TRUE "" "" conditi… 0.313 1.91 18.7
## 3 Q06830… TRUE "" "" conditi… 0.238 1.91 14.2
## 4 P12081… TRUE "" "" conditi… 0.822 1.52 9.85
## 5 P10636… TRUE "" "" conditi… 1.00 2.12 9.45
## 6 P00915… TRUE "" "" conditi… 0.336 0.633 7.45
## 7 P12081… TRUE "" "" conditi… 0.611 1.52 7.32
## 8 P02144… TRUE "" "" conditi… 0.159 0.877 7.00
## 9 P10145… TRUE "" "" conditi… 0.145 0.876 6.89
## 10 P55957… TRUE "" "" conditi… 0.593 0.688 6.65
## # … with 4,133 more rows, and 3 more variables: P.Value <dbl>,
## # adj.P.Val <dbl>, B <dbl>
res.CPTAC.ProPCA.full2 <- res.CPTAC.Base2 %>% left_join(res.CPTAC.ProPCA.full)
## Joining, by = c("protein", "contrast", "UPS")
## Warning: Column `protein` joining factor and character vector, coercing
## into character vector
res.CPTAC.ProPCA.full2[res.CPTAC.ProPCA.full2$protein == "P99999ups;CON__P62894",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")] <- res.CPTAC.ProPCA.full[res.CPTAC.ProPCA.full$protein == "P99999ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")]
res.CPTAC.ProPCA.full2[res.CPTAC.ProPCA.full2$protein == "P01008ups;CON__P41361",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")] <- res.CPTAC.ProPCA.full[res.CPTAC.ProPCA.full$protein == "P01008ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")]
res.CPTAC.ProPCA.full2[res.CPTAC.ProPCA.full2$protein == "P68871ups;CON__P02070;CON__Q3SX09",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")] <- res.CPTAC.ProPCA.full[res.CPTAC.ProPCA.full$protein == "P68871ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")]
res.CPTAC.ProPCA.full2[res.CPTAC.ProPCA.full2$protein == "P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")] <- res.CPTAC.ProPCA.full[res.CPTAC.ProPCA.full$protein == "P05759;P0CG63;P62988ups",][,c("contrast", "gene.name", "protein.name", "logFC", "AveExpr", "P.Value", "adj.P.Val", "B")]
res.CPTAC.ProPCA.full2 <- res.CPTAC.ProPCA.full2 %>% arrange(P.Value)
### FDR-FTP plots with MSstats preprocessing as a basis (Supplementary Fig. 6) ###
res.list <- list(res.CPTAC.full2,
res.CPTAC.co.full2,
res.CPTAC.Hurdle2,
res.CPTAC.KNN1.full2,
res.CPTAC.PI.full2,
CPTAC.Perseus.full2,
CPTAC.Perseus.imp.full2,
res.CPTAC.MSstats.full2,
res.CPTAC.MSstats.full2[!is.infinite(res.CPTAC.MSstats.full2$log2FC),],
res.CPTAC.ProPCA.full2
)
contrast.vec <- c("condition6B-condition6A",
"condition6C-condition6B",
"condition6C-condition6A")
names(contrast.vec) <- c("Condition 6B vs. 6A",
"Condition 6C vs. 6B",
"Condition 6C vs. 6A")
colors <- c("#FF681E", "#E15E9E", "#5A2A82", "blue", "green", "#1B2944", "#42B7BA", "yellow3", "peru", "red2") # darkblue: "#1B2944" gray: "#50FF00"
sort.list <- list(c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logOR"),
c("qchisq", "pchisq", "chisq", NA),
c("qvalue", "pvalue", "t", "logFC"),
c("qvalue", "pvalue", "t", "logFC"),
c("q.value", "p.value", "Test.statistic", "Difference"),
c("q.value", "p.value", "Test.statistic", "Difference"),
c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
c("adj.pvalue", "pvalue", "Tvalue", "log2FC"),
c("adj.P.Val", "P.Value", "t", "logFC"))
PlotFDPTPR(res.list, contrast.vec, colors, sort.list, TPcol = c("UPS"), plotSVG = FALSE) #TRUE
# Make the plots
# BA, CB, CA
upratio <- c(1.565597, 1.571906, 3.137504)
# 1. Top: condition 6B vs. 6A
TP.BvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6B-condition6A" & UPS)
FP.BvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6B-condition6A" & !UPS)
scatterHist(x = ((FP.BvsA$logOR) %>% unlist),
y = ((FP.BvsA$logFC) %>% unlist),
x2 = ((TP.BvsA$logOR) %>% unlist),
y2 = ((TP.BvsA$logFC) %>% unlist),
main = "Condition B vs. A", plotSVG = FALSE, filename = paste0(wd,"/scatterhistBA.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistBA.svg")
sum(is.na(((FP.BvsA$logFC) %>% unlist)))
## [1] 159
# 159 yeast proteins without a log2FC estimate
sum(is.na(((TP.BvsA$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate
# 2. Middle: condition 6C vs. 6A
TP.CvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6A" & UPS)
FP.CvsA <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6A" & !UPS)
scatterHist(x = ((FP.CvsA$logOR) %>% unlist),
y = ((FP.CvsA$logFC) %>% unlist),
x2 = ((TP.CvsA$logOR) %>% unlist),
y2 = ((TP.CvsA$logFC) %>% unlist),
main = "Condition C vs. A", plotSVG = FALSE, filename = paste0(wd,"/scatterhistCA.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistCA.svg")
sum(is.na(((FP.CvsA$logFC) %>% unlist)))
## [1] 158
# 158 yeast proteins without a log2FC estimate
sum(is.na(((TP.CvsA$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate
# 3. Bottom: condition 6C vs. 6B
TP.CvsB <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6B" & UPS)
FP.CvsB <- res.CPTAC.Hurdle %>% filter(contrast == "condition6C-condition6B" & !UPS)
scatterHist(x = ((FP.CvsB$logOR) %>% unlist),
y = ((FP.CvsB$logFC) %>% unlist),
x2 = ((TP.CvsB$logOR) %>% unlist),
y2 = ((TP.CvsB$logFC) %>% unlist),
main = "Condition C vs. B", plotSVG = FALSE, filename = paste0(wd,"/scatterhistCB.svg")) #, plotSVG = TRUE, filename = paste0(wd,"/scatterhistCB.svg")
sum(is.na(((FP.CvsB$logFC) %>% unlist)))
## [1] 159
# 159 yeast proteins without a log2FC estimate
sum(is.na(((TP.CvsB$logFC) %>% unlist)))
## [1] 2
# 2 UPS1 proteins without a log2FC estimate
Aditionally, we here provide the plots for the underlying (preprocessed) peptide-level data for the 1500 most significant proteins for each method in the HEART dataset, grouped per method.
### First 1500 most significant genes ###
# A. Select top 1500 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.1500 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:1500]
# B. Select top 1500 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.1500 <- hurdle.AvsV.ov[1:1500,]$gene.name
# C. Select top 1500 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.1500 <- MSqRob.AvsV.ov[1:1500,]$gene.name
### Overlap all (Supplementary Fig. 8) ###
plot_proteins(gene.names = "MYL7", title = "MYL7", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "PAM", title = "PAM", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "MYBPHL", title = "MYBPHL", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "MAP1B", title = "MAP1B", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("MYL7", "PAM", "MYBPHL", "MAP1B"), title = c("overlap_all.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from all three methods
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & (genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm(Perseus.pvals/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_all.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Overlap Hurdle and Perseus (Supplementary Fig. 9) ###
plot_proteins(gene.names = "CA13", title = "CA13", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "NTN1", title = "NTN1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "PRKG2", title = "PRKG2", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "SBF2", title = "SBF2", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("CA13", "NTN1", "PRKG2", "SBF2"), title = c("overlap_hurdle_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Hurdle and Perseus and against MSqRob
genes <- genes.Hurdle.AvsV.1500[(genes.Hurdle.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.Hurdle.AvsV.1500 %in% genes.MSqRob.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm(Perseus.pvals/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_hurdle_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Hurdle alone (Supplementary Fig. 10) ###
plot_proteins(gene.names = "HTT", title = "HTT", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "ACACA", title = "ACACA", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "ABCA6", title = "ABCA6", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "FTCD", title = "FTCD", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("HTT", "ACACA", "ABCA6", "FTCD"), title = c("only_hurdle.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Hurdle and against MSqRob and Perseus
genes <- genes.Hurdle.AvsV.1500[!(genes.Hurdle.AvsV.1500 %in% genes.MSqRob.AvsV.1500) & !(genes.Hurdle.AvsV.1500 %in% genes.Perseus.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_hurdle.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Perseus alone (Supplementary Fig. 11) ###
plot_proteins(gene.names = "GJA5", title = "GJA5", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "ASNSD1", title = "ASNSD1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "MTCL1", title = "MTCL1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "BLOC1S5;BLOC1S5-TXNDC5", title = "BLOC1S5;BLOC1S5-TXNDC5", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("GJA5", "ASNSD1", "MTCL1", "BLOC1S5;BLOC1S5-TXNDC5"), title = c("only_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from Perseus and against Hurdle and MSqRob
genes <- genes.Perseus.AvsV.1500[!(genes.Perseus.AvsV.1500 %in% genes.MSqRob.AvsV.1500) & !(genes.Perseus.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm((1-MSqRob.pvals)/2)^2+qnorm(Perseus.pvals/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Overlap MSqRob and Perseus (Supplementary Fig. 12) ###
plot_proteins(gene.names = "AK4", title = "AK4", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "NDUFV1", title = "NDUFV1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "DECR1", title = "DECR1", msnset = peptides.HEART2, pdf = FALSE)
plot_proteins(gene.names = "SDHA", title = "SDHA", msnset = peptides.HEART2, pdf = FALSE)
# plot_proteinsSVG(gene.names = c("AK4", "NDUFV1", "DECR1", "SDHA"), title = c("overlap_MSqRob_Perseus.svg"), msnset = peptides.HEART2)
# Order p-values by evidence from MSqRob and Perseus and against Hurdle
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm(Perseus.pvals/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_MSqRob_Perseus.pdf", msnset = peptides.HEART2, pdf = TRUE)
### MSqRob alone ###
# Order p-values by evidence from MSqRob and against Hurdle and Perseus
genes <- genes.MSqRob.AvsV.1500[!(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm((1-hurdle.pvals)/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "only_MSqRob.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Overlap MSqRob and Hurdle ###
# Order p-values by evidence from MSqRob and Hurdle and against Perseus
genes <- genes.MSqRob.AvsV.1500[(genes.MSqRob.AvsV.1500 %in% genes.Hurdle.AvsV.1500) & !(genes.MSqRob.AvsV.1500 %in% genes.Perseus.AvsV.1500)] %>% as.character
Mdata <- MSqRob.AvsV.ov[MSqRob.AvsV.ov$gene.name %in% genes,]
Pdata <- Perseus.AvsV.ov[Perseus.AvsV.ov$`Gene names` %in% genes,]
Hdata <- hurdle.AvsV.ov[hurdle.AvsV.ov$gene.name %in% genes,]
MSqRob.pvals <- Mdata$pvalue
names(MSqRob.pvals) <- Mdata$gene.name
MSqRob.pvals <- MSqRob.pvals[genes]
MSqRob.pvals[is.na(MSqRob.pvals)] <- 1
Perseus.pvals <- 10^(-Pdata$`minusLOG(P-value)`)
names(Perseus.pvals) <- Pdata$`Gene names`
Perseus.pvals <- Perseus.pvals[genes]
Perseus.pvals[is.na(Perseus.pvals)] <- 1
hurdle.pvals <- Hdata$pchisq
names(hurdle.pvals) <- Hdata$gene.name
hurdle.pvals <- hurdle.pvals[genes]
hurdle.pvals[is.na(hurdle.pvals)] <- 1
genes.ordered <- qnorm(MSqRob.pvals/2)^2+qnorm((1-Perseus.pvals)/2)^2+qnorm(hurdle.pvals/2)^2
genes.ordered <- sort(genes.ordered, decreasing = TRUE)
# plot_proteins(gene.names = names(genes.ordered), title = "overlap_MSqRob_Hurdle.pdf", msnset = peptides.HEART2, pdf = TRUE)
### Venn diagrams atrial vs. ventricular region ###
### Checks: ###
all(Perseus.AvsV.ov$`Gene names` %in% hurdle.AvsV.ov$gene.name)
## [1] TRUE
all(Perseus.AvsV.ov$`Gene names` %in% MSqRob.AvsV.ov$gene.name)
## [1] TRUE
all(hurdle.AvsV.ov$gene.name %in% Perseus.AvsV.ov$`Gene names`)
## [1] TRUE
all(MSqRob.AvsV.ov$gene.name %in% Perseus.AvsV.ov$`Gene names`)
## [1] TRUE
dim(MSqRob.AvsV.ov)
## [1] 7822 9
dim(hurdle.AvsV.ov)
## [1] 7822 8
length(unique(Perseus.AvsV.ov$`Gene names`))
## [1] 7822
################
### Top: the first 500 significant genes ###
# A. Select top 500 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.500 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:500]
# B. Select top 500 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.500 <- hurdle.AvsV.ov[1:500,]$gene.name
# C. Select top 500 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.500 <- MSqRob.AvsV.ov[1:500,]$gene.name
# MSqRob alone
sum(!(genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 99
# 99
# Hurdle alone
sum(!(genes.Hurdle.AvsV.500 %in% genes.MSqRob.AvsV.500) & !(genes.Hurdle.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 85
# 85
# Perseus alone
sum(!(genes.Perseus.AvsV.500 %in% genes.MSqRob.AvsV.500) & !(genes.Perseus.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 218
# 218
# Overlap MSqRob and Hurdle
sum((genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500))
## [1] 158
# 158
# Overlap MSqRob and Perseus
sum((genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500) & !(genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 25
# 25
# Overlap Hurdle and Perseus
sum((genes.Hurdle.AvsV.500 %in% genes.Perseus.AvsV.500) & !(genes.Hurdle.AvsV.500 %in% genes.MSqRob.AvsV.500))
## [1] 39
# 39
# Overlap everything
sum((genes.MSqRob.AvsV.500 %in% genes.Perseus.AvsV.500) & (genes.MSqRob.AvsV.500 %in% genes.Hurdle.AvsV.500))
## [1] 218
# 218
# Control:
length(genes.MSqRob.AvsV.500)
## [1] 500
99+218+158+25
## [1] 500
length(genes.Hurdle.AvsV.500)
## [1] 500
85+158+218+39
## [1] 500
length(genes.Perseus.AvsV.500)
## [1] 500
218+39+25+218
## [1] 500
### Bottom: the first 1000 significant genes ###
# A. Select top 1000 DA gene identifiers for the Perseus analysis of ventricles vs. atria
genes.Perseus.AvsV.1000 <- (Perseus.AvsV.ov %>% pull(`Gene names`) %>% unique)[1:1000]
# B. Select top 1000 DA gene identifiers for the Hurdle analysis of atria vs. ventricles
genes.Hurdle.AvsV.1000 <- hurdle.AvsV.ov[1:1000,]$gene.name
# C. Select top 1000 DA gene identifiers for the MSqRob analysis of atria vs. ventricles
genes.MSqRob.AvsV.1000 <- MSqRob.AvsV.ov[1:1000,]$gene.name
# MSqRob alone
sum(!(genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 174
# 174
# Hurdle alone
sum(!(genes.Hurdle.AvsV.1000 %in% genes.MSqRob.AvsV.1000) & !(genes.Hurdle.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 134
# 134
# Perseus alone
sum(!(genes.Perseus.AvsV.1000 %in% genes.MSqRob.AvsV.1000) & !(genes.Perseus.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 439
# 439
# Overlap MSqRob and Hurdle
sum((genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000))
## [1] 363
# 363
# Overlap MSqRob and Perseus
sum((genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000) & !(genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 58
# 58
# Overlap Hurdle and Perseus
sum((genes.Hurdle.AvsV.1000 %in% genes.Perseus.AvsV.1000) & !(genes.Hurdle.AvsV.1000 %in% genes.MSqRob.AvsV.1000))
## [1] 98
# 98
# Overlap everything
sum((genes.MSqRob.AvsV.1000 %in% genes.Perseus.AvsV.1000) & (genes.MSqRob.AvsV.1000 %in% genes.Hurdle.AvsV.1000))
## [1] 405
# 405
# Control:
length(genes.MSqRob.AvsV.1000)
## [1] 1000
174+363+58+405
## [1] 1000
length(genes.Hurdle.AvsV.1000)
## [1] 1000
134+363+98+405
## [1] 1000
length(genes.Perseus.AvsV.1000)
## [1] 1000
439+58+98+405
## [1] 1000
# Create a mock experimental design where there is no differential abundance in reality
# There is however a huge "batch effect" of "condlab", the combined effects of spike-in condition and lab
pData <- pData(peptides.CPTAC2)
pData$condlab <- paste0(pData$condition,pData$lab)
pData$treat <- rep(c("one","two","three"),9)
pData$treat <- factor(pData$treat, levels = c("one", "two", "three"))
peptides.CPTAC2.noeff <- MSnSet(Biobase::exprs(peptides.CPTAC2), fData = AnnotatedDataFrame(fData(peptides.CPTAC2)), pData = AnnotatedDataFrame(pData))
### Recreate peptides.CPTAC3 ###
# We require by default at least 2 identifications of a peptide sequence, as with 1 identification, the model will be perfectly confounded
sel <- rowSums(!is.na(Biobase::exprs(peptides.CPTAC2))) >= 2
peptides.CPTAC3 <- peptides.CPTAC2[sel]
# Again remove proteins that are only identified by one peptide
sel <- fData(peptides.CPTAC3) %>% group_by(protein) %>% mutate(flag = n() > 1) %>% pull(flag)
peptides.CPTAC3 <- peptides.CPTAC3[sel]
# Drop levels
peptides.CPTAC3 <- MSnbase::MSnSet(exprs = Biobase::exprs(peptides.CPTAC3), fData = droplevels(Biobase::fData(peptides.CPTAC3)), pData = Biobase::pData(peptides.CPTAC3))
dim(peptides.CPTAC2)
## [1] 9377 27
# 9377 27
dim(peptides.CPTAC3)
## [1] 9158 27
# 9158 27
length(unique(fData(peptides.CPTAC2)$protein))
## [1] 1381
# 1381
length(unique(fData(peptides.CPTAC3)$protein))
## [1] 1347
# 1347
rm(sel)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 14272080 762.3 24211394 1293.1 NA 20215918 1079.7
## Vcells 51878250 395.8 81269643 620.1 16384 67657195 516.2
######
peptides.CPTAC3.noeff <- MSnSet(Biobase::exprs(peptides.CPTAC3), fData = AnnotatedDataFrame(fData(peptides.CPTAC3)), pData = AnnotatedDataFrame(pData))
# Create contrast matrix for the mock analysis
form <- expression ~ (1|treat) + (1|condlab) + (1|sample) + (1|sequence)
contrasts <- contrast_helper(form, peptides.CPTAC3.noeff, treat)
contrasts <- contrasts[c(1,3,2),c(2,1,3)]
colnames(contrasts)[3] <- "treatthree-treattwo"
contrasts[c(2,3),3] <- c(-1,1)
contrasts
## treattwo-treatone treatthree-treatone treatthree-treattwo
## treatone -1 -1 0
## treattwo 1 0 -1
## treatthree 0 1 1
# Fit the models, this might take a few minutes
# res.CPTAC.noeff <- do_mm(formula = form, msnset = peptides.CPTAC3.noeff, group_var = protein, contrasts = contrasts, type_df = "conservative", max_iter = 20)
# Or load the data
load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff.RData"))
annotation.CPTAC <- tibble(protein = fData(peptides.CPTAC2)$protein, UPS = grepl("ups",fData(peptides.CPTAC2)$protein), gene.name = fData(peptides.CPTAC2)$gene.name, protein.name = fData(peptides.CPTAC2)$protein.name) %>% distinct
res.CPTAC.Base <- rbind(annotation.CPTAC, annotation.CPTAC, annotation.CPTAC)
res.CPTAC.Base$contrast <- c(rep("treattwo-treatone",nrow(annotation.CPTAC)), rep("treatthree-treatone",nrow(annotation.CPTAC)), rep("treatthree-treattwo",nrow(annotation.CPTAC)))
res.CPTAC.noeff.full <- res.CPTAC.Base %>% left_join(res.CPTAC.noeff$result)
## Joining, by = c("protein", "contrast")
## Warning: Column `protein` joining factors with different levels, coercing
## to character vector
res.CPTAC.noeff.full <- res.CPTAC.noeff.full %>% arrange(pvalue)
prot.CPTAC.noeff.co <- do_count_groups(peptides.CPTAC2.noeff, group_var = protein, keep_fData_cols = c("gene.name","protein.name"))
# Remove estimates based on only one sample in one or more conditions from the data
not_one <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(1,25, by=3)] > 0) %>% rowSums < 2)
not_two <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(2,26, by=3)] > 0) %>% rowSums < 2)
not_three <- which((Biobase::exprs(prot.CPTAC.noeff.co)[,seq(3,27, by=3)] > 0) %>% rowSums < 2)
### Important: re-adjust the false discovery rate! ###
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_one)) & (res.CPTAC.noeff.full$contrast %in% c("treattwo-treatone", "treatthree-treatone")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_two)) & (res.CPTAC.noeff.full$contrast %in% c("treattwo-treatone", "treatthree-treattwo")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full[(res.CPTAC.noeff.full$protein %in% names(not_three)) & (res.CPTAC.noeff.full$contrast %in% c("treatthree-treatone", "treatthree-treattwo")),][,c("logFC", "se", "t", "df", "pvalue", "qvalue")] <- NA
res.CPTAC.noeff.full <- res.CPTAC.noeff.full %>% group_by(contrast) %>%
mutate(qvalue = p.adjust(pvalue, method = "BH"))
# Or load the MSqRob result object via:
# load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff_full.RData"))
### Fit the quasi-binomial model ###
form.co <- ~ treat + condlab
# contrasts <- contrast_helper(~ treat, peptides.CPTAC2.noeff, treat)
# res.CPTAC.noeff.co <- do_glm(formula = form.co, msnset = prot.CPTAC.noeff.co, group_var = protein, contrasts = contrasts, contFun = "contEst")
# OR: load the model:
load(paste0(wd, "/save_files_mock_CPTAC/res_CPTAC_noeff_co.RData"))
res.CPTAC.noeff.co.full <- res.CPTAC.Base %>% left_join(res.CPTAC.noeff.co$result)
## Joining, by = c("protein", "contrast")
res.CPTAC.noeff.co.full <- res.CPTAC.noeff.co.full %>% arrange(pvalue)
# cols <- colorRampPalette(c("#FF3100", "#FCFF00", "#45FE4F",
# "#00FEFF", "#000099"
# ))(256)
# col = cols[ceiling(rank(HEART_A_vsV$pchisq)/length(HEART_A_vsV$pchisq)*256)]
# Order results according to protein and contrast instead of p-value
res.intensities.sorted <- res.CPTAC.noeff.full %>% arrange(contrast, protein)
res.counts.sorted <- res.CPTAC.noeff.co.full %>% arrange(contrast, protein)
all(res.intensities.sorted$protein == res.counts.sorted$protein)
## [1] TRUE
all(res.intensities.sorted$contrast == res.counts.sorted$contrast)
## [1] TRUE
# library(svglite)
# svglite(file = "all_mock_correlations.svg", width = 14, height = 14)
# # svglite(file = "no_correlation_MSqRob_qbinom_21.svg", width = 7, height = 7)
# par(mar=c(5.1,6.1,4.1,2.1))
# par(mfrow = c(2,2))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treattwo-treatone",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treattwo-treatone",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 2 vs. 1", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")
# # par(mar=c(5.1,4.1,4.1,2.1))
# # dev.off()
#
# # library(svglite)
# # svglite(file = "no_correlation_MSqRob_qbinom_31.svg", width = 7, height = 7)
# # par(mar=c(5.1,6.1,4.1,2.1))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treatthree-treatone",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treatthree-treatone",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 3 vs. 1", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")
# # par(mar=c(5.1,4.1,4.1,2.1))
# # dev.off()
#
# # library(svglite)
# # svglite(file = "no_correlation_MSqRob_qbinom_32.svg", width = 7, height = 7)
# # par(mar=c(5.1,6.1,4.1,2.1))
plot(res.intensities.sorted[res.intensities.sorted$contrast == "treatthree-treattwo",]$logFC, res.counts.sorted[res.counts.sorted$contrast == "treatthree-treattwo",]$logOR, col = c("black","red")[as.numeric(res.intensities.sorted$UPS)+1], pch = 20, xlab = "MSqRob log2FC", ylab = "Quasi-binomial log2OR", las = 1, frame.plot = FALSE, main = "Nonsense treatment 3 vs. 2", cex.lab = 2, cex.axis = 2, cex.main = 2) #[1:1000]
abline(v = 0, col = "black")
abline(h = 0, col = "black")
# # par(mar=c(5.1,4.1,4.1,2.1))
# par(mfrow = c(1,1))
# dev.off()